<!doctype html>

<html lang="en">
<head>
  <!-- Encoding declaration comes first so it precedes every other piece of head content. -->
  <meta charset="utf-8">
  <link rel="shortcut icon" href="static/images/favicon.ico" type="image/x-icon">
  <title>htmlparser.js (Closure Library API Documentation - JavaScript)</title>
  <link rel="stylesheet" href="static/css/base.css">
  <link rel="stylesheet" href="static/css/doc.css">
  <link rel="stylesheet" href="static/css/sidetree.css">
  <link rel="stylesheet" href="static/css/prettify.css">

  <!--
    Page-level globals, defined before doc.js is loaded.
    NOTE(review): presumably read by static/js/doc.js; confirm before renaming or moving.
  -->
  <script>
     var _staticFilePath = "static/";
     var _typeTreeName = "goog";
     var _fileTreeName = "Source";
  </script>

  <!-- Loaded synchronously: the body's onload handler calls grokdoc.onLoad(). -->
  <script src="static/js/doc.js"></script>
</head>

<body onload="grokdoc.onLoad();">

<!-- Page header: the documentation title link plus the "Go to class or file" text box. -->
<div id="header">
  <div class="g-section g-tpl-50-50 g-split">
    <div class="g-unit g-first">
      <a id="logo" href="index.html">Closure Library API Documentation</a>
    </div>

    <div class="g-unit">
      <div class="g-c">
        <!-- The caption is a real label so assistive technology announces it for the input. -->
        <label for="ac"><strong>Go to class or file:</strong></label>
        <!-- NOTE(review): presumably wired up by doc.js via its id; keep id="ac" unchanged. -->
        <input type="text" id="ac">
      </div>
    </div>
  </div>
</div>

<!-- NOTE(review): empty "clear" element, presumably ends the header's column layout; confirm in base.css. -->
<div class="clear"></div>

<!-- Page heading: the name of the file listed below, linked to closure_third_party_closure_goog_caja_string_html_htmlparser.js.html. -->
<h2><a href="closure_third_party_closure_goog_caja_string_html_htmlparser.js.html">htmlparser.js</a></h2>

<pre class="prettyprint lang-js">
<a name="line1"></a>// Copyright 2006-2008, The Google Caja project.
<a name="line2"></a>// Modifications Copyright 2009 The Closure Library Authors. All Rights Reserved.
<a name="line3"></a>// All Rights Reserved
<a name="line4"></a>
<a name="line5"></a>/**
<a name="line6"></a> * @license Portions of this code are from the google-caja project, received by
<a name="line7"></a> * Google under the Apache license (http://code.google.com/p/google-caja/).
<a name="line8"></a> * All other code is Copyright 2009 Google, Inc. All Rights Reserved.
<a name="line9"></a>
<a name="line10"></a>// Copyright (C) 2006 Google Inc.
<a name="line11"></a>//
<a name="line12"></a>// Licensed under the Apache License, Version 2.0 (the &quot;License&quot;);
<a name="line13"></a>// you may not use this file except in compliance with the License.
<a name="line14"></a>// You may obtain a copy of the License at
<a name="line15"></a>//
<a name="line16"></a>//      http://www.apache.org/licenses/LICENSE-2.0
<a name="line17"></a>//
<a name="line18"></a>// Unless required by applicable law or agreed to in writing, software
<a name="line19"></a>// distributed under the License is distributed on an &quot;AS IS&quot; BASIS,
<a name="line20"></a>// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
<a name="line21"></a>// See the License for the specific language governing permissions and
<a name="line22"></a>// limitations under the License.
<a name="line23"></a>
<a name="line24"></a> */
<a name="line25"></a>
<a name="line26"></a>/**
<a name="line27"></a> * @fileoverview An HTML SAX parser.
<a name="line28"></a> *
<a name="line29"></a> * Examples of usage of the {@code goog.string.html.HtmlParser}:
<a name="line30"></a> * &lt;pre&gt;
<a name="line31"></a> *   var handler = new MyCustomHtmlVisitorHandlerThatExtendsHtmlSaxHandler();
<a name="line32"></a> *   var parser = new goog.string.html.HtmlParser();
<a name="line33"></a> *   parser.parse(handler, &#39;&lt;html&gt;&lt;a href=&quot;google.com&quot;&gt;link found!&lt;/a&gt;&lt;/html&gt;&#39;);
<a name="line34"></a> * &lt;/pre&gt;
<a name="line35"></a> *
<a name="line36"></a> * TODO(user, msamuel): validate sanitizer regex against the HTML5 grammar at
<a name="line37"></a> * http://www.whatwg.org/specs/web-apps/current-work/multipage/syntax.html
<a name="line38"></a> * http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html
<a name="line39"></a> * http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html
<a name="line40"></a> * http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html
<a name="line41"></a> *
<a name="line42"></a> * @supported IE6, IE7, IE8, FF1.5, FF2, FF3, Chrome 3.0, Safari and Opera 10.
<a name="line43"></a> */
<a name="line44"></a>
<a name="line45"></a>goog.provide(&#39;goog.string.html.HtmlParser&#39;);
<a name="line46"></a>goog.provide(&#39;goog.string.html.HtmlParser.EFlags&#39;);
<a name="line47"></a>goog.provide(&#39;goog.string.html.HtmlParser.Elements&#39;);
<a name="line48"></a>goog.provide(&#39;goog.string.html.HtmlParser.Entities&#39;);
<a name="line49"></a>goog.provide(&#39;goog.string.html.HtmlSaxHandler&#39;);
<a name="line50"></a>
<a name="line51"></a>
<a name="line52"></a>/**
<a name="line53"></a> * An Html parser: {@code parse} takes a string and calls methods on
<a name="line54"></a> * {@code goog.string.html.HtmlSaxHandler} while it is visiting it.
<a name="line55"></a> *
<a name="line56"></a> * @constructor
<a name="line57"></a> */
<a name="line58"></a>goog.string.html.HtmlParser = function() {
<a name="line59"></a>};
<a name="line60"></a>
<a name="line61"></a>
<a name="line62"></a>/**
<a name="line63"></a> * HTML entities that are encoded/decoded.
<a name="line64"></a> * TODO(user): use {@code goog.string.htmlEncode} instead.
<a name="line65"></a> * @enum {string}
<a name="line66"></a> */
<a name="line67"></a>goog.string.html.HtmlParser.Entities = {
<a name="line68"></a>  lt: &#39;&lt;&#39;,
<a name="line69"></a>  gt: &#39;&gt;&#39;,
<a name="line70"></a>  amp: &#39;&amp;&#39;,
<a name="line71"></a>  nbsp: &#39;\240&#39;,
<a name="line72"></a>  quot: &#39;&quot;&#39;,
<a name="line73"></a>  apos: &#39;\&#39;&#39;
<a name="line74"></a>};
<a name="line75"></a>
<a name="line76"></a>
<a name="line77"></a>/**
<a name="line78"></a> * The html eflags, used internally on the parser.
<a name="line79"></a> * @enum {number}
<a name="line80"></a> */
<a name="line81"></a>goog.string.html.HtmlParser.EFlags = {
<a name="line82"></a>  OPTIONAL_ENDTAG: 1,
<a name="line83"></a>  EMPTY: 2,
<a name="line84"></a>  CDATA: 4,
<a name="line85"></a>  RCDATA: 8,
<a name="line86"></a>  UNSAFE: 16,
<a name="line87"></a>  FOLDABLE: 32
<a name="line88"></a>};
<a name="line89"></a>
<a name="line90"></a>
<a name="line91"></a>/**
<a name="line92"></a> * A map of element to a bitmap of flags it has, used internally on the parser.
<a name="line93"></a> * @type {Object}
<a name="line94"></a> */
<a name="line95"></a>goog.string.html.HtmlParser.Elements = {
<a name="line96"></a>  &#39;a&#39;: 0,
<a name="line97"></a>  &#39;abbr&#39;: 0,
<a name="line98"></a>  &#39;acronym&#39;: 0,
<a name="line99"></a>  &#39;address&#39;: 0,
<a name="line100"></a>  &#39;applet&#39;: goog.string.html.HtmlParser.EFlags.UNSAFE,
<a name="line101"></a>  &#39;area&#39;: goog.string.html.HtmlParser.EFlags.EMPTY,
<a name="line102"></a>  &#39;b&#39;: 0,
<a name="line103"></a>  &#39;base&#39;: goog.string.html.HtmlParser.EFlags.EMPTY |
<a name="line104"></a>      goog.string.html.HtmlParser.EFlags.UNSAFE,
<a name="line105"></a>  &#39;basefont&#39;: goog.string.html.HtmlParser.EFlags.EMPTY |
<a name="line106"></a>      goog.string.html.HtmlParser.EFlags.UNSAFE,
<a name="line107"></a>  &#39;bdo&#39;: 0,
<a name="line108"></a>  &#39;big&#39;: 0,
<a name="line109"></a>  &#39;blockquote&#39;: 0,
<a name="line110"></a>  &#39;body&#39;: goog.string.html.HtmlParser.EFlags.OPTIONAL_ENDTAG |
<a name="line111"></a>      goog.string.html.HtmlParser.EFlags.UNSAFE |
<a name="line112"></a>      goog.string.html.HtmlParser.EFlags.FOLDABLE,
<a name="line113"></a>  &#39;br&#39;: goog.string.html.HtmlParser.EFlags.EMPTY,
<a name="line114"></a>  &#39;button&#39;: 0,
<a name="line115"></a>  &#39;caption&#39;: 0,
<a name="line116"></a>  &#39;center&#39;: 0,
<a name="line117"></a>  &#39;cite&#39;: 0,
<a name="line118"></a>  &#39;code&#39;: 0,
<a name="line119"></a>  &#39;col&#39;: goog.string.html.HtmlParser.EFlags.EMPTY,
<a name="line120"></a>  &#39;colgroup&#39;: goog.string.html.HtmlParser.EFlags.OPTIONAL_ENDTAG,
<a name="line121"></a>  &#39;dd&#39;: goog.string.html.HtmlParser.EFlags.OPTIONAL_ENDTAG,
<a name="line122"></a>  &#39;del&#39;: 0,
<a name="line123"></a>  &#39;dfn&#39;: 0,
<a name="line124"></a>  &#39;dir&#39;: 0,
<a name="line125"></a>  &#39;div&#39;: 0,
<a name="line126"></a>  &#39;dl&#39;: 0,
<a name="line127"></a>  &#39;dt&#39;: goog.string.html.HtmlParser.EFlags.OPTIONAL_ENDTAG,
<a name="line128"></a>  &#39;em&#39;: 0,
<a name="line129"></a>  &#39;fieldset&#39;: 0,
<a name="line130"></a>  &#39;font&#39;: 0,
<a name="line131"></a>  &#39;form&#39;: 0,
<a name="line132"></a>  &#39;frame&#39;: goog.string.html.HtmlParser.EFlags.EMPTY |
<a name="line133"></a>      goog.string.html.HtmlParser.EFlags.UNSAFE,
<a name="line134"></a>  &#39;frameset&#39;: goog.string.html.HtmlParser.EFlags.UNSAFE,
<a name="line135"></a>  &#39;h1&#39;: 0,
<a name="line136"></a>  &#39;h2&#39;: 0,
<a name="line137"></a>  &#39;h3&#39;: 0,
<a name="line138"></a>  &#39;h4&#39;: 0,
<a name="line139"></a>  &#39;h5&#39;: 0,
<a name="line140"></a>  &#39;h6&#39;: 0,
<a name="line141"></a>  &#39;head&#39;: goog.string.html.HtmlParser.EFlags.OPTIONAL_ENDTAG |
<a name="line142"></a>      goog.string.html.HtmlParser.EFlags.UNSAFE |
<a name="line143"></a>      goog.string.html.HtmlParser.EFlags.FOLDABLE,
<a name="line144"></a>  &#39;hr&#39;: goog.string.html.HtmlParser.EFlags.EMPTY,
<a name="line145"></a>  &#39;html&#39;: goog.string.html.HtmlParser.EFlags.OPTIONAL_ENDTAG |
<a name="line146"></a>      goog.string.html.HtmlParser.EFlags.UNSAFE |
<a name="line147"></a>      goog.string.html.HtmlParser.EFlags.FOLDABLE,
<a name="line148"></a>  &#39;i&#39;: 0,
<a name="line149"></a>  &#39;iframe&#39;: goog.string.html.HtmlParser.EFlags.UNSAFE |
<a name="line150"></a>      goog.string.html.HtmlParser.EFlags.CDATA,
<a name="line151"></a>  &#39;img&#39;: goog.string.html.HtmlParser.EFlags.EMPTY,
<a name="line152"></a>  &#39;input&#39;: goog.string.html.HtmlParser.EFlags.EMPTY,
<a name="line153"></a>  &#39;ins&#39;: 0,
<a name="line154"></a>  &#39;isindex&#39;: goog.string.html.HtmlParser.EFlags.EMPTY |
<a name="line155"></a>      goog.string.html.HtmlParser.EFlags.UNSAFE,
<a name="line156"></a>  &#39;kbd&#39;: 0,
<a name="line157"></a>  &#39;label&#39;: 0,
<a name="line158"></a>  &#39;legend&#39;: 0,
<a name="line159"></a>  &#39;li&#39;: goog.string.html.HtmlParser.EFlags.OPTIONAL_ENDTAG,
<a name="line160"></a>  &#39;link&#39;: goog.string.html.HtmlParser.EFlags.EMPTY |
<a name="line161"></a>      goog.string.html.HtmlParser.EFlags.UNSAFE,
<a name="line162"></a>  &#39;map&#39;: 0,
<a name="line163"></a>  &#39;menu&#39;: 0,
<a name="line164"></a>  &#39;meta&#39;: goog.string.html.HtmlParser.EFlags.EMPTY |
<a name="line165"></a>      goog.string.html.HtmlParser.EFlags.UNSAFE,
<a name="line166"></a>  &#39;noframes&#39;: goog.string.html.HtmlParser.EFlags.UNSAFE |
<a name="line167"></a>      goog.string.html.HtmlParser.EFlags.CDATA,
<a name="line168"></a>  &#39;noscript&#39;: goog.string.html.HtmlParser.EFlags.UNSAFE |
<a name="line169"></a>      goog.string.html.HtmlParser.EFlags.CDATA,
<a name="line170"></a>  &#39;object&#39;: goog.string.html.HtmlParser.EFlags.UNSAFE,
<a name="line171"></a>  &#39;ol&#39;: 0,
<a name="line172"></a>  &#39;optgroup&#39;: 0,
<a name="line173"></a>  &#39;option&#39;: goog.string.html.HtmlParser.EFlags.OPTIONAL_ENDTAG,
<a name="line174"></a>  &#39;p&#39;: goog.string.html.HtmlParser.EFlags.OPTIONAL_ENDTAG,
<a name="line175"></a>  &#39;param&#39;: goog.string.html.HtmlParser.EFlags.EMPTY |
<a name="line176"></a>      goog.string.html.HtmlParser.EFlags.UNSAFE,
<a name="line177"></a>  &#39;pre&#39;: 0,
<a name="line178"></a>  &#39;q&#39;: 0,
<a name="line179"></a>  &#39;s&#39;: 0,
<a name="line180"></a>  &#39;samp&#39;: 0,
<a name="line181"></a>  &#39;script&#39;: goog.string.html.HtmlParser.EFlags.UNSAFE |
<a name="line182"></a>      goog.string.html.HtmlParser.EFlags.CDATA,
<a name="line183"></a>  &#39;select&#39;: 0,
<a name="line184"></a>  &#39;small&#39;: 0,
<a name="line185"></a>  &#39;span&#39;: 0,
<a name="line186"></a>  &#39;strike&#39;: 0,
<a name="line187"></a>  &#39;strong&#39;: 0,
<a name="line188"></a>  &#39;style&#39;: goog.string.html.HtmlParser.EFlags.UNSAFE |
<a name="line189"></a>      goog.string.html.HtmlParser.EFlags.CDATA,
<a name="line190"></a>  &#39;sub&#39;: 0,
<a name="line191"></a>  &#39;sup&#39;: 0,
<a name="line192"></a>  &#39;table&#39;: 0,
<a name="line193"></a>  &#39;tbody&#39;: goog.string.html.HtmlParser.EFlags.OPTIONAL_ENDTAG,
<a name="line194"></a>  &#39;td&#39;: goog.string.html.HtmlParser.EFlags.OPTIONAL_ENDTAG,
<a name="line195"></a>  &#39;textarea&#39;: goog.string.html.HtmlParser.EFlags.RCDATA,
<a name="line196"></a>  &#39;tfoot&#39;: goog.string.html.HtmlParser.EFlags.OPTIONAL_ENDTAG,
<a name="line197"></a>  &#39;th&#39;: goog.string.html.HtmlParser.EFlags.OPTIONAL_ENDTAG,
<a name="line198"></a>  &#39;thead&#39;: goog.string.html.HtmlParser.EFlags.OPTIONAL_ENDTAG,
<a name="line199"></a>  &#39;title&#39;: goog.string.html.HtmlParser.EFlags.RCDATA |
<a name="line200"></a>      goog.string.html.HtmlParser.EFlags.UNSAFE,
<a name="line201"></a>  &#39;tr&#39;: goog.string.html.HtmlParser.EFlags.OPTIONAL_ENDTAG,
<a name="line202"></a>  &#39;tt&#39;: 0,
<a name="line203"></a>  &#39;u&#39;: 0,
<a name="line204"></a>  &#39;ul&#39;: 0,
<a name="line205"></a>  &#39;var&#39;: 0
<a name="line206"></a>};
<a name="line207"></a>
<a name="line208"></a>
<a name="line209"></a>/**
<a name="line210"></a> * Regular expression that matches &amp;s.
<a name="line211"></a> * @type {RegExp}
<a name="line212"></a> * @private
<a name="line213"></a> */
<a name="line214"></a>goog.string.html.HtmlParser.AMP_RE_ = /&amp;/g;
<a name="line215"></a>
<a name="line216"></a>
<a name="line217"></a>/**
<a name="line218"></a> * Regular expression that matches loose &amp;s.
<a name="line219"></a> * @type {RegExp}
<a name="line220"></a> * @private
<a name="line221"></a> */
<a name="line222"></a>goog.string.html.HtmlParser.LOOSE_AMP_RE_ =
<a name="line223"></a>    /&amp;([^a-z#]|#(?:[^0-9x]|x(?:[^0-9a-f]|$)|$)|$)/gi;
<a name="line224"></a>
<a name="line225"></a>
<a name="line226"></a>/**
<a name="line227"></a> * Regular expression that matches &lt;.
<a name="line228"></a> * @type {RegExp}
<a name="line229"></a> * @private
<a name="line230"></a> */
<a name="line231"></a>goog.string.html.HtmlParser.LT_RE_ = /&lt;/g;
<a name="line232"></a>
<a name="line233"></a>
<a name="line234"></a>/**
<a name="line235"></a> * Regular expression that matches &gt;.
<a name="line236"></a> * @type {RegExp}
<a name="line237"></a> * @private
<a name="line238"></a> */
<a name="line239"></a>goog.string.html.HtmlParser.GT_RE_ = /&gt;/g;
<a name="line240"></a>
<a name="line241"></a>
<a name="line242"></a>/**
<a name="line243"></a> * Regular expression that matches &quot;.
<a name="line244"></a> * @type {RegExp}
<a name="line245"></a> * @private
<a name="line246"></a> */
<a name="line247"></a>goog.string.html.HtmlParser.QUOTE_RE_ = /\&quot;/g;
<a name="line248"></a>
<a name="line249"></a>
<a name="line250"></a>/**
<a name="line251"></a> * Regular expression that matches =.
<a name="line252"></a> * @type {RegExp}
<a name="line253"></a> * @private
<a name="line254"></a> */
<a name="line255"></a>goog.string.html.HtmlParser.EQUALS_RE_ = /=/g;
<a name="line256"></a>
<a name="line257"></a>
<a name="line258"></a>/**
<a name="line259"></a> * Regular expression that matches null characters.
<a name="line260"></a> * @type {RegExp}
<a name="line261"></a> * @private
<a name="line262"></a> */
<a name="line263"></a>goog.string.html.HtmlParser.NULL_RE_ = /\0/g;
<a name="line264"></a>
<a name="line265"></a>
<a name="line266"></a>/**
<a name="line267"></a> * Regular expression that matches entities.
<a name="line268"></a> * @type {RegExp}
<a name="line269"></a> * @private
<a name="line270"></a> */
<a name="line271"></a>goog.string.html.HtmlParser.ENTITY_RE_ = /&amp;(#\d+|#x[0-9A-Fa-f]+|\w+);/g;
<a name="line272"></a>
<a name="line273"></a>
<a name="line274"></a>/**
<a name="line275"></a> * Regular expression that matches a decimal character reference body (#123).
<a name="line276"></a> * @type {RegExp}
<a name="line277"></a> * @private
<a name="line278"></a> */
<a name="line279"></a>goog.string.html.HtmlParser.DECIMAL_ESCAPE_RE_ = /^#(\d+)$/;
<a name="line280"></a>
<a name="line281"></a>
<a name="line282"></a>/**
<a name="line283"></a> * Regular expression that matches a hex character reference body (#x7B).
<a name="line284"></a> * @type {RegExp}
<a name="line285"></a> * @private
<a name="line286"></a> */
<a name="line287"></a>goog.string.html.HtmlParser.HEX_ESCAPE_RE_ = /^#x([0-9A-Fa-f]+)$/;
<a name="line288"></a>
<a name="line289"></a>
<a name="line290"></a>/**
<a name="line291"></a> * Regular expression that matches the next token to process inside a tag.
<a name="line292"></a> * @type {RegExp}
<a name="line293"></a> * @private
<a name="line294"></a> */
<a name="line295"></a>goog.string.html.HtmlParser.INSIDE_TAG_TOKEN_ = new RegExp(
<a name="line296"></a>    // Don&#39;t capture space.
<a name="line297"></a>    &#39;^\\s*(?:&#39; +
<a name="line298"></a>    // Capture an attribute name in group 1, and value in group 3.
<a name="line299"></a>    // We capture the fact that there was an attribute in group 2, since
<a name="line300"></a>    // interpreters are inconsistent in whether a group that matches nothing
<a name="line301"></a>    // is null, undefined, or the empty string.
<a name="line302"></a>    (&#39;(?:&#39; +
<a name="line303"></a>       &#39;([a-z][a-z-]*)&#39; +                   // attribute name
<a name="line304"></a>       (&#39;(&#39; +                               // optionally followed
<a name="line305"></a>          &#39;\\s*=\\s*&#39; +
<a name="line306"></a>          (&#39;(&#39; +
<a name="line307"></a>             // A double quoted string.
<a name="line308"></a>             &#39;\&quot;[^\&quot;]*\&quot;&#39; +
<a name="line309"></a>             // A single quoted string.
<a name="line310"></a>             &#39;|\&#39;[^\&#39;]*\&#39;&#39; +
<a name="line311"></a>             // The positive lookahead is used to make sure that in
<a name="line312"></a>             // &lt;foo bar= baz=boo&gt;, the value for bar is blank, not &quot;baz=boo&quot;.
<a name="line313"></a>             &#39;|(?=[a-z][a-z-]*\\s*=)&#39; +
<a name="line314"></a>             // An unquoted value that is not an attribute name.
<a name="line315"></a>             // We know it is not an attribute name because the previous
<a name="line316"></a>             // zero-width match would&#39;ve eliminated that possibility.
<a name="line317"></a>             &#39;|[^&gt;\&quot;\&#39;\\s]*&#39; +
<a name="line318"></a>             &#39;)&#39;
<a name="line319"></a>             ) +
<a name="line320"></a>          &#39;)&#39;
<a name="line321"></a>          ) + &#39;?&#39; +
<a name="line322"></a>       &#39;)&#39;
<a name="line323"></a>       ) +
<a name="line324"></a>    // End of tag captured in group 4.
<a name="line325"></a>    &#39;|(/?&gt;)&#39; +
<a name="line326"></a>    // Don&#39;t capture cruft
<a name="line327"></a>    &#39;|[^a-z\\s&gt;]+)&#39;,
<a name="line328"></a>    &#39;i&#39;);
<a name="line329"></a>
<a name="line330"></a>
<a name="line331"></a>/**
<a name="line332"></a> * Regular expression that matches the next token to be processed when we are
<a name="line333"></a> * outside a tag.
<a name="line334"></a> * @type {RegExp}
<a name="line335"></a> * @private
<a name="line336"></a> */
<a name="line337"></a>goog.string.html.HtmlParser.OUTSIDE_TAG_TOKEN_ = new RegExp(
<a name="line338"></a>    &#39;^(?:&#39; +
<a name="line339"></a>    // Entity captured in group 1.
<a name="line340"></a>    &#39;&amp;(\\#[0-9]+|\\#[x][0-9a-f]+|\\w+);&#39; +
<a name="line341"></a>    // Comment, doctypes, and processing instructions not captured.
<a name="line342"></a>    &#39;|&lt;[!]--[\\s\\S]*?--&gt;|&lt;!\\w[^&gt;]*&gt;|&lt;\\?[^&gt;*]*&gt;&#39; +
<a name="line343"></a>    // &#39;/&#39; captured in group 2 for close tags, and name captured in group 3.
<a name="line344"></a>    &#39;|&lt;(/)?([a-z][a-z0-9]*)&#39; +
<a name="line345"></a>    // Text captured in group 4.
<a name="line346"></a>    &#39;|([^&lt;&amp;&gt;]+)&#39; +
<a name="line347"></a>    // Cruft captured in group 5.
<a name="line348"></a>    &#39;|([&lt;&amp;&gt;]))&#39;,
<a name="line349"></a>    &#39;i&#39;);
<a name="line350"></a>
<a name="line351"></a>
<a name="line352"></a>/**
<a name="line353"></a> * Given a SAX-like {@code goog.string.html.HtmlSaxHandler} parses a
<a name="line354"></a> * {@code htmlText} and lets the {@code handler} know the structure while
<a name="line355"></a> * visiting the nodes.
<a name="line356"></a> *
<a name="line357"></a> * @param {goog.string.html.HtmlSaxHandler} handler The HtmlSaxHandler that will
<a name="line358"></a> *     receive the events.
<a name="line359"></a> * @param {string} htmlText The html text.
<a name="line360"></a> */
<a name="line361"></a>goog.string.html.HtmlParser.prototype.parse = function(handler, htmlText) {
<a name="line362"></a>  var htmlLower = null;
<a name="line363"></a>  var inTag = false;  // True iff we&#39;re currently processing a tag.
<a name="line364"></a>  var attribs = [];  // Accumulates attribute names and values.
<a name="line365"></a>  var tagName;  // The name of the tag currently being processed.
<a name="line366"></a>  var eflags;  // The element flags for the current tag.
<a name="line367"></a>  var openTag;  // True if the current tag is an open tag.
<a name="line368"></a>
<a name="line369"></a>  // Lets the handler know that we are starting to parse the document.
<a name="line370"></a>  handler.startDoc();
<a name="line371"></a>
<a name="line372"></a>  // Consumes tokens from the htmlText and stops once all tokens are processed.
<a name="line373"></a>  while (htmlText) {
<a name="line374"></a>    var regex = inTag ?
<a name="line375"></a>        goog.string.html.HtmlParser.INSIDE_TAG_TOKEN_ :
<a name="line376"></a>        goog.string.html.HtmlParser.OUTSIDE_TAG_TOKEN_;
<a name="line377"></a>    // Gets the next token
<a name="line378"></a>    var m = htmlText.match(regex);
<a name="line379"></a>    // And removes it from the string
<a name="line380"></a>    htmlText = htmlText.substring(m[0].length);
<a name="line381"></a>
<a name="line382"></a>    // TODO(goto): cleanup this code breaking it into separate methods.
<a name="line383"></a>    if (inTag) {
<a name="line384"></a>      if (m[1]) { // Attribute.
<a name="line385"></a>        // SetAttribute with uppercase names doesn&#39;t work on IE6.
<a name="line386"></a>        var attribName = goog.string.html.toLowerCase(m[1]);
<a name="line387"></a>        var decodedValue;
<a name="line388"></a>        if (m[2]) {
<a name="line389"></a>          var encodedValue = m[3];
<a name="line390"></a>          switch (encodedValue.charCodeAt(0)) {  // Strip quotes.
<a name="line391"></a>            case 34: case 39:
<a name="line392"></a>              encodedValue = encodedValue.substring(
<a name="line393"></a>                  1, encodedValue.length - 1);
<a name="line394"></a>              break;
<a name="line395"></a>          }
<a name="line396"></a>          decodedValue = this.unescapeEntities_(this.stripNULs_(encodedValue));
<a name="line397"></a>        } else {
<a name="line398"></a>          // Use name as value for valueless attribs, so
<a name="line399"></a>          //   &lt;input type=checkbox checked&gt;
<a name="line400"></a>          // gets attributes [&#39;type&#39;, &#39;checkbox&#39;, &#39;checked&#39;, &#39;checked&#39;]
<a name="line401"></a>          decodedValue = attribName;
<a name="line402"></a>        }
<a name="line403"></a>        attribs.push(attribName, decodedValue);
<a name="line404"></a>      } else if (m[4]) {
<a name="line405"></a>        if (eflags !== void 0) {  // False if not in whitelist.
<a name="line406"></a>          if (openTag) {
<a name="line407"></a>            if (handler.startTag) {
<a name="line408"></a>              handler.startTag(/** @type {string} */ (tagName), attribs);
<a name="line409"></a>            }
<a name="line410"></a>          } else {
<a name="line411"></a>            if (handler.endTag) {
<a name="line412"></a>              handler.endTag(/** @type {string} */ (tagName));
<a name="line413"></a>            }
<a name="line414"></a>          }
<a name="line415"></a>        }
<a name="line416"></a>
<a name="line417"></a>        if (openTag &amp;&amp; (eflags &amp;
<a name="line418"></a>            (goog.string.html.HtmlParser.EFlags.CDATA |
<a name="line419"></a>             goog.string.html.HtmlParser.EFlags.RCDATA))) {
<a name="line420"></a>          if (htmlLower === null) {
<a name="line421"></a>            htmlLower = goog.string.html.toLowerCase (htmlText);
<a name="line422"></a>          } else {
<a name="line423"></a>           htmlLower = htmlLower.substring(
<a name="line424"></a>                htmlLower.length - htmlText.length);
<a name="line425"></a>          }
<a name="line426"></a>          var dataEnd = htmlLower.indexOf(&#39;&lt;/&#39; + tagName);
<a name="line427"></a>          if (dataEnd &lt; 0) {
<a name="line428"></a>            dataEnd = htmlText.length;
<a name="line429"></a>          }
<a name="line430"></a>          if (eflags &amp; goog.string.html.HtmlParser.EFlags.CDATA) {
<a name="line431"></a>            if (handler.cdata) {
<a name="line432"></a>              handler.cdata(htmlText.substring(0, dataEnd));
<a name="line433"></a>            }
<a name="line434"></a>          } else if (handler.rcdata) {
<a name="line435"></a>            handler.rcdata(
<a name="line436"></a>                this.normalizeRCData_(htmlText.substring(0, dataEnd)));
<a name="line437"></a>          }
<a name="line438"></a>          htmlText = htmlText.substring(dataEnd);
<a name="line439"></a>        }
<a name="line440"></a>
<a name="line441"></a>        tagName = eflags = openTag = void 0;
<a name="line442"></a>        attribs.length = 0;
<a name="line443"></a>        inTag = false;
<a name="line444"></a>      }
<a name="line445"></a>    } else {
<a name="line446"></a>      if (m[1]) {  // Entity.
<a name="line447"></a>        handler.pcdata(m[0]);
<a name="line448"></a>      } else if (m[3]) {  // Tag.
<a name="line449"></a>        openTag = !m[2];
<a name="line450"></a>        inTag = true;
<a name="line451"></a>        tagName = goog.string.html.toLowerCase (m[3]);
<a name="line452"></a>        eflags = goog.string.html.HtmlParser.Elements.hasOwnProperty(tagName) ?
<a name="line453"></a>            goog.string.html.HtmlParser.Elements[tagName] : void 0;
<a name="line454"></a>      } else if (m[4]) {  // Text.
<a name="line455"></a>        handler.pcdata(m[4]);
<a name="line456"></a>      } else if (m[5]) {  // Cruft.
<a name="line457"></a>        switch (m[5]) {
<a name="line458"></a>          case &#39;&lt;&#39;: handler.pcdata(&#39;&amp;lt;&#39;); break;
<a name="line459"></a>          case &#39;&gt;&#39;: handler.pcdata(&#39;&amp;gt;&#39;); break;
<a name="line460"></a>          default: handler.pcdata(&#39;&amp;amp;&#39;); break;
<a name="line461"></a>        }
<a name="line462"></a>      }
<a name="line463"></a>    }
<a name="line464"></a>  }
<a name="line465"></a>
<a name="line466"></a>  // Lets the handler know that we are done parsing the document.
<a name="line467"></a>  handler.endDoc();
<a name="line468"></a>};
<a name="line469"></a>
<a name="line470"></a>
<a name="line471"></a>/**
<a name="line472"></a> * Decodes an HTML entity.
<a name="line473"></a> *
<a name="line474"></a> * @param {string} name The content between the &#39;&amp;&#39; and the &#39;;&#39;.
<a name="line475"></a> * @return {string} A single unicode code-point as a string.
<a name="line476"></a> * @private
<a name="line477"></a> */
<a name="line478"></a>goog.string.html.HtmlParser.prototype.lookupEntity_ = function(name) {
<a name="line479"></a>  // TODO(goto): use {goog.string.htmlDecode} instead ?
<a name="line480"></a>  // TODO(goto): &amp;pi; is different from &amp;Pi;
<a name="line481"></a>  name = goog.string.html.toLowerCase(name);
<a name="line482"></a>  if (goog.string.html.HtmlParser.Entities.hasOwnProperty(name)) {
<a name="line483"></a>    return goog.string.html.HtmlParser.Entities[name];
<a name="line484"></a>  }
<a name="line485"></a>  var m = name.match(goog.string.html.HtmlParser.DECIMAL_ESCAPE_RE_);
<a name="line486"></a>  if (m) {
<a name="line487"></a>    return String.fromCharCode(parseInt(m[1], 10));
<a name="line488"></a>  } else if (
<a name="line489"></a>      !!(m = name.match(goog.string.html.HtmlParser.HEX_ESCAPE_RE_))) {
<a name="line490"></a>    return String.fromCharCode(parseInt(m[1], 16));
<a name="line491"></a>  }
<a name="line492"></a>  return &#39;&#39;;
<a name="line493"></a>};
<a name="line494"></a>
<a name="line495"></a>
<a name="line496"></a>/**
<a name="line497"></a> * Removes null characters from the string.
<a name="line498"></a> * @param {string} s The string to have the null characters removed.
<a name="line499"></a> * @return {string} A string without null characters.
<a name="line500"></a> * @private
<a name="line501"></a> */
<a name="line502"></a>goog.string.html.HtmlParser.prototype.stripNULs_ = function(s) {
<a name="line503"></a>  return s.replace(goog.string.html.HtmlParser.NULL_RE_, &#39;&#39;);
<a name="line504"></a>};
<a name="line505"></a>
<a name="line506"></a>
<a name="line507"></a>/**
<a name="line508"></a> * Returns the plain text of a chunk of HTML CDATA that may contain entities.
<a name="line509"></a> *
<a name="line510"></a> * TODO(goto): use {@code goog.string.unescapeEntities} instead ?
<a name="line511"></a> * @param {string} s A chunk of HTML CDATA.  It must not start or end inside
<a name="line512"></a> *   an HTML entity.
<a name="line513"></a> * @return {string} The unescaped entities.
<a name="line514"></a> * @private
<a name="line515"></a> */
<a name="line516"></a>goog.string.html.HtmlParser.prototype.unescapeEntities_ = function(s) {
<a name="line517"></a>  return s.replace(
<a name="line518"></a>      goog.string.html.HtmlParser.ENTITY_RE_,
<a name="line519"></a>      goog.bind(this.lookupEntity_, this));
<a name="line520"></a>};
<a name="line521"></a>
<a name="line522"></a>
<a name="line523"></a>/**
<a name="line524"></a> * Escape entities in RCDATA that can be escaped without changing the meaning.
<a name="line525"></a> * @param {string} rcdata The RCDATA string we want to normalize.
<a name="line526"></a> * @return {string} A normalized version of RCDATA.
<a name="line527"></a> * @private
<a name="line528"></a> */
<a name="line529"></a>goog.string.html.HtmlParser.prototype.normalizeRCData_ = function(rcdata) {
<a name="line530"></a>  return rcdata.
<a name="line531"></a>      replace(goog.string.html.HtmlParser.LOOSE_AMP_RE_, &#39;&amp;amp;$1&#39;).
<a name="line532"></a>      replace(goog.string.html.HtmlParser.LT_RE_, &#39;&amp;lt;&#39;).
<a name="line533"></a>      replace(goog.string.html.HtmlParser.GT_RE_, &#39;&amp;gt;&#39;);
<a name="line534"></a>};
<a name="line535"></a>
<a name="line536"></a>
<a name="line537"></a>/**
<a name="line538"></a> * TODO(goto): why isn&#39;t this in the string package? Does this solve any
<a name="line539"></a> * real problem? Move it to the goog.string package if it does.
<a name="line540"></a> *
<a name="line541"></a> * @param {string} str The string to lower case.
<a name="line542"></a> * @return {string} The str in lower case format.
<a name="line543"></a> */
<a name="line544"></a>goog.string.html.toLowerCase = function(str) {
<a name="line545"></a>  // The below may not be true on browsers in the Turkish locale.
<a name="line546"></a>  if (&#39;script&#39; === &#39;SCRIPT&#39;.toLowerCase()) {
<a name="line547"></a>    return str.toLowerCase();
<a name="line548"></a>  } else {
<a name="line549"></a>    return str.replace(/[A-Z]/g, function(ch) {
<a name="line550"></a>      return String.fromCharCode(ch.charCodeAt(0) | 32);
<a name="line551"></a>    });
<a name="line552"></a>  }
<a name="line553"></a>};
<a name="line554"></a>
<a name="line555"></a>
<a name="line556"></a>/**
<a name="line557"></a> * An interface to the {@code goog.string.html.HtmlParser} visitor, that gets
<a name="line558"></a> * called while the HTML is being parsed.
<a name="line559"></a> *
<a name="line560"></a> * @constructor
<a name="line561"></a> */
<a name="line562"></a>goog.string.html.HtmlSaxHandler = function() {
<a name="line563"></a>};
<a name="line564"></a>
<a name="line565"></a>
<a name="line566"></a>/**
<a name="line567"></a> * Handler called when the parser found a new tag.
<a name="line568"></a> * @param {string} name The name of the tag that is starting.
<a name="line569"></a> * @param {Array.&lt;string&gt;} attributes The attributes of the tag.
<a name="line570"></a> */
<a name="line571"></a>goog.string.html.HtmlSaxHandler.prototype.startTag = goog.abstractMethod;
<a name="line572"></a>
<a name="line573"></a>
<a name="line574"></a>/**
<a name="line575"></a> * Handler called when the parser found a closing tag.
<a name="line576"></a> * @param {string} name The name of the tag that is ending.
<a name="line577"></a> */
<a name="line578"></a>goog.string.html.HtmlSaxHandler.prototype.endTag = goog.abstractMethod;
<a name="line579"></a>
<a name="line580"></a>
<a name="line581"></a>/**
<a name="line582"></a> * Handler called when PCDATA is found.
<a name="line583"></a> * @param {string} text The PCDATA text found.
<a name="line584"></a> */
<a name="line585"></a>goog.string.html.HtmlSaxHandler.prototype.pcdata = goog.abstractMethod;
<a name="line586"></a>
<a name="line587"></a>
<a name="line588"></a>/**
<a name="line589"></a> * Handler called when RCDATA is found.
<a name="line590"></a> * @param {string} text The RCDATA text found.
<a name="line591"></a> */
<a name="line592"></a>goog.string.html.HtmlSaxHandler.prototype.rcdata = goog.abstractMethod;
<a name="line593"></a>
<a name="line594"></a>
<a name="line595"></a>/**
<a name="line596"></a> * Handler called when CDATA is found.
<a name="line597"></a> * @param {string} text The CDATA text found.
<a name="line598"></a> */
<a name="line599"></a>goog.string.html.HtmlSaxHandler.prototype.cdata = goog.abstractMethod;
<a name="line600"></a>
<a name="line601"></a>
<a name="line602"></a>/**
<a name="line603"></a> * Handler called when the parser is starting to parse the document.
<a name="line604"></a> */
<a name="line605"></a>goog.string.html.HtmlSaxHandler.prototype.startDoc = goog.abstractMethod;
<a name="line606"></a>
<a name="line607"></a>
<a name="line608"></a>/**
<a name="line609"></a> * Handler called when the parsing is done.
<a name="line610"></a> */
<a name="line611"></a>goog.string.html.HtmlSaxHandler.prototype.endDoc = goog.abstractMethod;
</pre>


</body>
</html>
